home *** CD-ROM | disk | FTP | other *** search
/ Aminet 31 / Aminet 31 (1999)(Schatztruhe)[!][Jun 1999].iso / Aminet / comm / tcp / GetAllHTML.lha / GetAllHTML.rexx < prev   
Encoding:
OS/2 REXX Batch file  |  1999-04-04  |  20.3 KB  |  571 lines

  1. /* GetAllHTML "URL"/A,"DestDir"/A,NOASK/S,ARC/S,PIC/S,RESUME/S,PAUSE/S,TERSE/S,DEPTH=/N/K,PORT=/K,BASEURL=/K,BROKENLINKS/S
  2.   v0.64ß (04-04-99)  Copyright 1999 Chris S Handley
  3.   (email: Chris.S.Handley@btinternet.com)
  4.  
  5.    This is going to be converted to a super-fast AmigaE version, so I am not
  6.   working on this (much) anymore.  If you alter & distribute this, please
  7.   mention me as the original author!  However, I would prefer you send me any
  8.   suggestions to be used in the E version...
  9.  
  10.    See GetAllHTML.doc for more details
  11. */
  12.  
  13. OPTIONS RESULTS
  14. CALL AddLib('rexxsupport.library', 0, -30, 0)
  15.  
  16. SAY 'GetAllHTML v0.64ß  Copyright 1998-9 Chris Handley (read program file for details)'
  17.  
  18.  /* fixed locations: dir used for external (broken-link) fetches, the downloader program, and the temp-file stem */
  19. ExtDir='T:'
  20. HTTPResume='Programs:Utils/Comms/HTTPResume'
  21. TempFile='T:GetAllHTML'
  22. UniqueNum = Random(1, 999, Time('S'))    /* seeded from the seconds-since-midnight clock */
  23. DO FOREVER                               /* keep drawing until the resulting file name is unused */
  24.     UniqueNum = Random(1, 999, Time('S'))
  25.     IF ~Exists(TempFile||UniqueNum) THEN LEAVE
  26. END
  27. TempFile=TempFile||UniqueNum
  28.  
  29.  /* deal with args: "URL" and "DestDir" in double quotes, followed by up to ten blank-separated switches */
  30. Parse VALUE Arg(1) WITH '"' MainURL '"' . '"' DestDir '"' Switch1 Switch2 Switch3 Switch4 Switch5 Switch6 Switch7 Switch8 Switch9 Switch10
  31. IF (MainURL='')|(DestDir='') THEN DO
  32.     Say 'ERROR:  Empty argument(s)!'
  33.         Say 'Usage:  GetAllHTML "URL"/A,"DestDir"/A,NOASK/S,ARC/S,PIC/S,RESUME/S,PAUSE/S,TERSE/S,DEPTH=/N/K,PORT=/K,BASEURL=/K,BROKENLINKS/S'
  34.     Say 'Note - both URL & DestDir *must* be enclosed in "double quotes".'
  35.     Say '     - after DEPTH should be a "=" followed by a number with NO spaces between them.'
  36.     Say '     - after PORT should be a "=" followed by a string with NO spaces between them.'
  37.     Say '     - after BASEURL should be a "=" followed by a string with NO spaces between them.'
  38.     Exit 20
  39. END
  40. IF (Right(DestDir,1)~='/')&(Right(DestDir,1)~=':') THEN DestDir=DestDir||'/'
  41. CALL MakeDir(DestDir)
  42. IF Left(MainURL,7)~='http://' THEN MainURL='http://'||MainURL
  43. Switch1=Upper(Switch1); Switch2=Upper(Switch2); Switch3=Upper(Switch3); Switch4=Upper(Switch4); Switch5=Upper(Switch5); Switch6=Upper(Switch6); Switch7=Upper(Switch7); Switch8=Upper(Switch8); Switch9=Upper(Switch9); Switch10=Upper(Switch10)
  44.  /* defaults when a switch is absent; SwNoPause=1 means "do not pause" (PAUSE clears it), SwDepth is a stem-name length limit */
  45. SwNoAsk=0; SwArc=0; SwPic=0; SwResume=0; SwDepth=30; SwNoPause=1; SwPort=0; SwTerse=0; BaseURLDir=''; SwBroken=0
  46.  /* a switch is recognised in any of the ten positions */
  47. IF (Switch1='NOASK')|(Switch2='NOASK')|(Switch3='NOASK')|(Switch4='NOASK')|(Switch5='NOASK')|(Switch6='NOASK')|(Switch7='NOASK')|(Switch8='NOASK')|(Switch9='NOASK')|(Switch10='NOASK') THEN SwNoAsk=1
  48. IF (Switch1='ARC')|(Switch2='ARC')|(Switch3='ARC')|(Switch4='ARC')|(Switch5='ARC')|(Switch6='ARC')|(Switch7='ARC')|(Switch8='ARC')|(Switch9='ARC')|(Switch10='ARC') THEN SwArc=1
  49. IF (Switch1='PIC')|(Switch2='PIC')|(Switch3='PIC')|(Switch4='PIC')|(Switch5='PIC')|(Switch6='PIC')|(Switch7='PIC')|(Switch8='PIC')|(Switch9='PIC')|(Switch10='PIC') THEN SwPic=1
  50. IF (Switch1='RESUME')|(Switch2='RESUME')|(Switch3='RESUME')|(Switch4='RESUME')|(Switch5='RESUME')|(Switch6='RESUME')|(Switch7='RESUME')|(Switch8='RESUME')|(Switch9='RESUME')|(Switch10='RESUME') THEN SwResume=1
  51. IF (Switch1='PAUSE')|(Switch2='PAUSE')|(Switch3='PAUSE')|(Switch4='PAUSE')|(Switch5='PAUSE')|(Switch6='PAUSE')|(Switch7='PAUSE')|(Switch8='PAUSE')|(Switch9='PAUSE')|(Switch10='PAUSE') THEN SwNoPause=0
  52. IF (Switch1='TERSE')|(Switch2='TERSE')|(Switch3='TERSE')|(Switch4='TERSE')|(Switch5='TERSE')|(Switch6='TERSE')|(Switch7='TERSE')|(Switch8='TERSE')|(Switch9='TERSE')|(Switch10='TERSE') THEN SwTerse=1
  53. IF (Switch1='BROKENLINKS')|(Switch2='BROKENLINKS')|(Switch3='BROKENLINKS')|(Switch4='BROKENLINKS')|(Switch5='BROKENLINKS')|(Switch6='BROKENLINKS')|(Switch7='BROKENLINKS')|(Switch8='BROKENLINKS')|(Switch9='BROKENLINKS')|(Switch10='BROKENLINKS') THEN SwBroken=1
  54. IF (Left(Switch1,5)='DEPTH')|(Left(Switch2,5)='DEPTH')|(Left(Switch3,5)='DEPTH')|(Left(Switch4,5)='DEPTH')|(Left(Switch5,5)='DEPTH')|(Left(Switch6,5)='DEPTH')|(Left(Switch7,5)='DEPTH')|(Left(Switch8,5)='DEPTH')|(Left(Switch9,5)='DEPTH')|(Left(Switch10,5)='DEPTH') THEN DO
  55.     Parse VALUE Upper(Arg(1)) WITH '"' . '"' . '"' . '"' . 'DEPTH=' Depth .
  56.     IF Depth='' THEN DO
  57.         Say 'No DEPTH number found (must use "DEPTH=x" where x is your number).'
  58.         Say 'Search pages up to a depth of: '
  59.         Pull Depth
  60.     END
  61.      /* convert the page depth into the maximum length of a stem name such as "Root.1.12." */
  62.     IF Depth>42 THEN Depth=42    /* sanity protect against ARexx limitation */
  63.     IF Depth<10 THEN
  64.         SwDepth=Depth*2    /* since each grows by 2 each depth (e.g.".2.3.4.5") */
  65.     ELSE
  66.         SwDepth=((Depth-9)*3)+(9*2) /* as above but above 9 grows by 3 (e.g.".12.13.14.15") */
  67.     SwDepth=SwDepth+5-2    /* 5 = length of "Root." */
  68. END
  69. IF (Left(Switch1,4)='PORT')|(Left(Switch2,4)='PORT')|(Left(Switch3,4)='PORT')|(Left(Switch4,4)='PORT')|(Left(Switch5,4)='PORT')|(Left(Switch6,4)='PORT')|(Left(Switch7,4)='PORT')|(Left(Switch8,4)='PORT')|(Left(Switch9,4)='PORT')|(Left(Switch10,4)='PORT') THEN DO
  70.     SwPort=1
  71.     Parse VALUE Upper(Arg(1)) WITH '"' . '"' . '"' . '"' . 'PORT=' Port .
  72.     IF Port='' THEN DO
  73.         Port=Address()    /* no name given: fall back to the current host address */
  74.         IF Left(Port,11)~='HTTPRESUME.' THEN DO
  75.             Say 'ERROR:  PORT argument was not followed by a = and a string with no spaces between (eg."PORT=HTTPResume.1"), and the host enviroment was not already HTTPResume!'
  76.             Exit 20
  77.         END
  78.     END
  79.  END
  80. ELSE Port='' /*probably not necessary*/
  81. IF (Left(Switch1,7)='BASEURL')|(Left(Switch2,7)='BASEURL')|(Left(Switch3,7)='BASEURL')|(Left(Switch4,7)='BASEURL')|(Left(Switch5,7)='BASEURL')|(Left(Switch6,7)='BASEURL')|(Left(Switch7,7)='BASEURL')|(Left(Switch8,7)='BASEURL')|(Left(Switch9,7)='BASEURL')|(Left(Switch10,7)='BASEURL') THEN DO
  82.     Parse VALUE Upper(Arg(1)) WITH '"' . '"' . '"' . '"' . 'BASEURL=' BaseURLDir .
  83.     IF BaseURLDir='' THEN DO
  84.         Say 'ERROR:  BASEURL argument was not followed by a = and a string with no spaces between (eg."BASEURL=www.amiga.com")!'
  85.         Exit 20
  86.     END
  87.     BaseURLDir=SubStr(Arg(1),Index(Upper(Arg(1)),BaseURLDir),Length(BaseURLDir))    /* re-read the value from the un-uppercased args to keep its case */
  88.     IF Right(BaseURLDir,1)~='/' THEN BaseURLDir=BaseURLDir||'/'
  89.  END
  90. ELSE DO
  91.     Parse VALUE Reverse(MainURL) WITH . '/' BaseURLDir    /* default base = MainURL with everything after its last slash removed */
  92.     IF Length(BaseURLDir)<8 THEN BaseURLDir = Reverse(MainURL)     /*check for cases like FileURL="http://www.kosh.net" - i.e.no end slash*/
  93.     BaseURLDir=Reverse(BaseURLDir)||'/'
  94. END
  95.  
  96. If SwResume=1 THEN Say 'NOTE:  Resume mode activated!'
  97. If SwBroken=1 THEN Say 'NOTE:  Broken-link detection mode activated!'
  98.  /* no PORT given: launch a private HTTPResume and read its port name from TempFile; otherwise attach to the named port */
  99. IF Port='' THEN DO
  100.      /* run HTTPResume & set-up related stuff; OVERWRITE cause problems (restart from scratch if fails in the middle) */
  101.     Address Command 'Run >Nil: '||HTTPResume||' GUI NODATECHECK AUTORESUME STARTICONIFIED QUICKQUIT NOERRREQ RXPORTFILE='||TempFile /*NOENV removed*/
  102.     Say 'Waiting for HTTPResume...'
  103.     DO UNTIL Exists(TempFile)
  104.         Call Delay(25)    /* CALL discards the result; a bare Delay(25) clause would be sent to the host as a command */
  105.     END
  106.     Call Delay(100)       /* extra wait after the file appears, before reading it */
  107.     IF ~Open(.port, TempFile, 'READ') THEN DO
  108.         Say 'ERROR:  Could not open "'||TempFile||'"!'
  109.         Exit 20
  110.     END
  111.     Port=ReadLn(.port)    /* first line of the file is the port name, or '***' on failure (see check below) */
  112.     Call Close(.port)
  113.     Call Delete(TempFile)
  114.     IF Port='***' THEN DO
  115.         Say 'ERROR:  HTTPResume could not open it''s ARexx port!'
  116.         Exit 20
  117.     END
  118.     Address(Port)
  119.  END
  120. ELSE DO
  121.     Address(Port)
  122. /*    SET OVERWRITE*/
  123.     SET NODATECHECK
  124.     SET AUTORESUME
  125.     SET QUICKQUIT
  126.     SET NOERRREQ
  127. END
  128.  
  129.  /* seed the URL tree with one entry: the start page, flagged for HTML scanning */
  130. Root.1=MainURL
  131. Root.1.HTML=1
  132. Root.0=1
  133.  
  134. LastSuffix=''    /* suffix of the most recent user-confirmed download (lets similar files through without asking) */
  135. ModemOnLine=0    /* set to 1 by DownloadList once its RESUME catch-up prompt has been shown */
  136.  
  137.  /* fetch and scan every page reachable from the start page */
  138. SAY 'Downloading & scanning pages...'
  139. CALL DownloadList('Root.', DestDir, BaseURLDir, SwNoAsk, SwArc, SwPic, SwResume, SwDepth, SwNoPause, SwTerse, SwBroken)
  140.  
  141. SAY 'Finished.'
  142. IF SwPort=0 THEN 'QUIT'    /* SwPort=0: no PORT switch was given, so tell the HTTPResume started above to quit */
  143. EXIT
  144.  
  145. DownloadList: PROCEDURE EXPOSE Root. Resume. ModemOnLine LastSuffix ExtDir
  146.     /* DownloadList(URLList,DestDir,BaseURLDir,SwNoAsk,SwArc,SwPic,SwResume,SwDepth,SwNoPause,SwTerse,SwBroken) - download every URL held in the stem named by URLList, scan HTML ones with GetURLs and recurse into the resulting sub-lists */
  147.      /* grab args */
  148.     URLList=Arg(1)
  149.     DestDir=Arg(2)
  150.     BaseURLDir=Arg(3)
  151.     SwNoAsk=Arg(4)
  152.     SwArc=Arg(5)
  153.     SwPic=Arg(6)
  154.     SwResume=Arg(7)
  155.     SwDepth=Arg(8)
  156.     SwNoPause=Arg(9)
  157.     SwTerse=Arg(10)
  158.     SwBroken=Arg(11)
  159.      /* URLList is a stem NAME (e.g. 'Root.' or 'Root.1.'), so INTERPRET is used to read <stem>0, the entry count */
  160.     INTERPRET 'URLListSize='||URLList||'0'
  161.  
  162.      /* deal with each URL in list in turn */
  163.     IF Length(URLList)>SwDepth THEN    /* stem name longer than the depth limit: do not descend further */
  164.         NOP
  165.      ELSE DO
  166.         IF URLListSize>0 THEN DO
  167. /*Say '-Length('URLList')='||Length(URLList)*/
  168.             DO i=1 TO URLListSize
  169.                 NewURLList=URLList||i
  170.                 INTERPRET 'URL='||NewURLList
  171.                 INTERPRET 'HTMLfile='||NewURLList||'.HTML'
  172.                  /* .EXT is 1 for links outside BaseURLDir; any other value is treated as 0 */
  173.                 INTERPRET 'ExternalLink='||NewURLList||'.EXT'
  174.                 IF ExternalLink~=1 THEN ExternalLink=0
  175.  
  176.                  /* decide on relative file & path */
  177.                 IF ExternalLink=0 THEN
  178.                      /* find local path */
  179.                     Parse VAR URL (BaseURLDir) PathFile
  180.                 ELSE DO
  181.                      /* outside normal search (external) - set PathFile as just file */
  182.                     Parse VALUE Reverse(URL) WITH PathFile '/' .
  183.                     PathFile=Reverse(PathFile)
  184.                 END
  185.                 IF (Right(PathFile,1)='/')|(PathFile='') THEN DO
  186.                     PathFile=PathFile||'InDeX.hTmL'    /* give filename-less pages a name */
  187.                     HTMLfile=1            /* force attempted scanning for HTMLs */
  188.                     GuessedURL=1
  189.                  END
  190.                 ELSE
  191.                     GuessedURL=0
  192.                 Parse VALUE Reverse(PathFile) WITH File '/' Path    /* split at the LAST slash: File = name, Path = leading dirs */
  193.                 File=Reverse(File)
  194.                 Path=Reverse(Path)
  195.                 IF Path='' THEN DO
  196.                     File=PathFile
  197.                     Path=''
  198.                 END
  199.  
  200.                  /* create necessary dir(s) */
  201.                 PathLeft=Path                    /* use URL minus file at end */
  202.                 CurPath=DestDir
  203.                 DO While PathLeft~=''
  204.                     Parse VALUE PathLeft WITH NewDir '/' PathLeft
  205.                     IF NewDir~=='' THEN DO
  206.                         CurPath=CurPath||NewDir||'/'
  207.                         CALL MakeDir(Left(CurPath,Length(CurPath)-1))
  208.                      END
  209.                     ELSE DO
  210.                         IF SwTerse=0 THEN DO
  211.                             IF SwNoPause=0 THEN DO
  212.                                 Say 'WARNING:  Empty dir name in URL "'||URL||'" (press <return>)'
  213.                                 Pull Input
  214.                              END
  215.                             ELSE
  216.                                 Say 'WARNING:  Empty dir name in URL "'||URL||'"'
  217.                         END
  218.                     END
  219.                 END
  220.                 IF ExternalLink=0 THEN
  221.                     DownloadFile=DestDir||PathFile
  222.                 ELSE
  223.                     DownloadFile=ExtDir||PathFile
  224.  
  225.                  /* RESUME mode: files already on disk are re-scanned instead of re-downloaded, until the first missing one */
  226.                 IF SwResume~=0 THEN DO
  227.                     SeenBefore=0
  228.                     RxDownloadFile=Translate(DownloadFile,'abcdefghijklmnopqrstuvwxyz0123456789','abcdefghijklmnopqrstuvwxyz0123456789','_')    /* NOTE(review): both tables are identical, so this looks like it leaves the name unchanged - confirm intent */
  229.                     IF Resume.RxDownloadFile=1 THEN SeenBefore=1
  230.  
  231.                     IF SeenBefore=0 THEN DO            /* if visited this page before then pass! */
  232.                         IF Exists(DownloadFile) THEN DO
  233.                             Resume.RxDownloadFile=1
  234.  
  235.                             IF HTMLfile=1 THEN DO
  236.                                  /* parse page for URLs into a list */
  237.                                 CALL GetURLs(NewURLList||'.',DownloadFile,BaseURLDir,URL,SwNoAsk,SwArc,SwPic,SwNoPause,SwTerse,SwBroken)
  238.                                          /* download pages from list */
  239.                                 CALL DownloadList(NewURLList||'.',DestDir,BaseURLDir,SwNoAsk,SwArc,SwPic,SwResume,SwDepth,SwNoPause,SwTerse,SwBroken)
  240.                             END
  241.                          END
  242.                         ELSE DO
  243.                             IF ExternalLink=0 THEN DO
  244.                                 SwResume=0    /* reached point did last time, now continue as before */
  245.                                 IF ModemOnLine=0 THEN DO    /* hack to ensure only halt for input once (so can leave alone) */
  246.                                     Say 'NOTE:  Reached point where left off! (press <return>)'
  247.                                     Pull Input
  248.                                 END
  249.                                 ModemOnLine=1
  250.                             END
  251.                         END
  252.  
  253.                     END
  254.                 END
  255.                  /* normal mode (or an external link): fetch the file unless it already exists on disk */
  256.                 IF (SwResume=0)|(ExternalLink=1) THEN DO
  257.                     IF ~Exists(DownloadFile) THEN DO    /* if visited this page before then pass! */
  258.                         IF ExternalLink=0 THEN DO
  259.                             /* download file */
  260.                             CALL GetHTML(URL,DownloadFile)
  261.  
  262.                             /* see if was downloaded */
  263.                             IF ~Exists(DownloadFile) THEN DO
  264.                                 IF SwTerse=0 THEN DO
  265.                                     IF GuessedURL~=1 THEN DO
  266.                                         IF SwNoPause=0 THEN DO
  267.                                             Say 'WARNING:  Couldn''t download file "'||DownloadFile||'" (press <return>)'
  268.                                             Pull Input
  269.                                          END
  270.                                         ELSE
  271.                                             Say 'WARNING:  Couldn''t download file "'||DownloadFile||'"'
  272.                                     END
  273.                                 END
  274.  
  275.                                 /* if not downloaded then place empty 'fake' file to stop RESUME stopping to early */
  276.                                 Call Open(.file, DownloadFile, 'WRITE')
  277.                                     Call Close(.file)
  278.                              END
  279.                             ELSE DO
  280.                                 /* scan downloaded file if asked to */
  281.                                 IF HTMLfile=1 THEN DO
  282.                                      /* parse page for URLs into a list */
  283.                                     CALL GetURLs(NewURLList||'.',DownloadFile,BaseURLDir,URL,SwNoAsk,SwArc,SwPic,SwNoPause,SwTerse,SwBroken)
  284.                                              /* download pages from list */
  285.                                     CALL DownloadList(NewURLList||'.',DestDir,BaseURLDir,SwNoAsk,SwArc,SwPic,SwResume,SwDepth,SwNoPause,SwTerse,SwBroken)
  286.                                 END
  287.                             END
  288.                          END
  289.                         ELSE DO
  290.                             /* download file, only if haven't done before (hijacked how RESUME checks) */
  291.                             RxDownloadFile=Translate(DownloadFile,'abcdefghijklmnopqrstuvwxyz0123456789','abcdefghijklmnopqrstuvwxyz0123456789','_')
  292.                             IF Resume.RxDownloadFile~=1 THEN DO
  293.                                 Resume.RxDownloadFile=1
  294.  
  295.                                 /* download file */
  296.                                 CALL GetHTML(URL,DownloadFile)
  297.  
  298.                                 /* re-try downloading twice, incase 'freak' connect failure */
  299.                                 IF ~Exists(DownloadFile) THEN DO
  300.                                     CALL Delay(50)
  301.                                     CALL GetHTML(URL,DownloadFile)
  302.  
  303.                                     IF ~Exists(DownloadFile) THEN DO
  304.                                         CALL Delay(50)
  305.                                         CALL GetHTML(URL,DownloadFile)
  306.                                     END
  307.                                 END
  308.                                  /* still missing after three attempts: report it against the page whose list this is; otherwise discard the test download */
  309.                                 IF ~Exists(DownloadFile) THEN DO
  310.                                     Parse VALUE Reverse(URLList) WITH . '.' BrokePage    /* drop the trailing '.' to get the parent entry's name */
  311.                                     INTERPRET 'BrokePage='||Reverse(BrokePage)
  312.                                     Say 'Found BROKEN LINK to "'||URL||'" in "'||BrokePage||'"'
  313.                                  END
  314.                                 ELSE
  315.                                     CALL Delete(DownloadFile)
  316.                             END
  317.                         END
  318.                     END
  319.                 END
  320.             END
  321.         END
  322.     END
  323. Return
  324.  
  325. GetURLs: PROCEDURE EXPOSE Root. LastSuffix
  326.     /* GetURL(URLList,DownloadFile,BaseURLDir,FileURL,SwNoAsk,SwArc,SwPic,SwNoPause,SwTerse,SwBroken) - scan the local file DownloadFile (fetched from FileURL) for links and store the accepted ones in the stem named by URLList */
  327.      /* get args */
  328.     URLList=Arg(1)
  329.     DownloadFile=Arg(2)
  330.     BaseURLDir=Arg(3)
  331.     FileURL=Arg(4)
  332.     SwNoAsk=Arg(5)
  333.     SwArc=Arg(6)
  334.     SwPic=Arg(7)
  335.     SwNoPause=Arg(8)
  336.     SwTerse=Arg(9)
  337.     SwBroken=Arg(10)
  338.      /* URLList is a stem NAME, so INTERPRET is used to start the list empty (<stem>0 = entry count) */
  339.     INTERPRET URLList||'0 = 0'
  340.  
  341.      /* expand path to global, if is local reference like "/new/0083.html" */
  342.     Parse VALUE Reverse(FileURL) WITH . '/' LocalURLDir
  343.     IF Length(LocalURLDir)<8 THEN LocalURLDir = Reverse(FileURL)    /*check for cases like FileURL="http://www.kosh.net" - i.e.no end slash*/
  344.     LocalURLDir=Reverse(LocalURLDir)||'/'
  345.  
  346.     PARSE VAR LocalURLDir 'http://' LocalURLDomain '/' .    /*recover domain from URL*/
  347.     LocalURLDomain='http://'||LocalURLDomain
  348.  
  349.      /* parse (possibly) downloaded HTML file for URLs */
  350.     IF Open(.file, DownloadFile, 'READ') THEN DO
  351. /*Say 'Reading HTML file "'||DownloadFile||'"'*/
  352.         DO WHILE ~EOF(.file)
  353.              /* search for HTML ref. links */
  354.             Line=ReadLn(.file)
  355.             ULine=Upper(Line)
  356.              /* Mode 0 walks the '<A HREF=' hits on this line, then Mode 1 walks the 'SRC=' hits; NewPos=-1 ends the line */
  357.             NewPos=0; Mode=0
  358.             DO UNTIL NewPos<0
  359.                  /* non-frame search */
  360.                 IF Mode=0 THEN DO
  361.                     NewPos=Pos('<A HREF=',ULine,NewPos+1)
  362.                     IF NewPos=0 THEN DO
  363.                         Mode=1
  364.                         NewPos=0
  365.                     END
  366.                 END
  367.                  /* frame/image search */
  368.                 IF Mode=1 THEN DO
  369.                     Done=1
  370.                      /* "SRC=" occurs for both in frames & images */
  371.                     NewPos=Pos('SRC=',ULine,NewPos+1)
  372.                     IF NewPos=0 THEN NewPos=-1
  373.                 END
  374.  
  375.                  /* expand URL to full path, remove non-file parts & store only if inside parameters */
  376.                 IF NewPos>0 THEN DO
  377.                     Parse VAR Line =NewPos '="' URL '"'
  378.                     IF URL='' THEN Parse VAR Line =NewPos '=\"' URL '"'    /*javascripts precede "s by a slash*/
  379. /*Say '-Found URL "'||URL||'"'*/
  380.                     IF URL~=='' THEN DO
  381.                         Parse UPPER VAR URL URLDev ':' URLRest
  382.                         Download=1
  383.                         IF (URLRest~=='')&(URLDev~='HTTP') THEN DO
  384.                              /* found e.g. "mailto:" */
  385.                             IF SwTerse=0 THEN Say 'Found non-http link "'||URL||'"'
  386.                             Download=0
  387.                         END
  388.                         IF URLDev=Upper(URL) THEN DO    /* no ':' anywhere in the link, so it is a relative reference */
  389.                         /*    IF Left(URL,1)='/' THEN DO
  390.                                 URL=SubStr(URL,2)        /* remove pre-slash */
  391.                                 DO While Left(URL,2)='..'    /* convert "../" to "//" */
  392.                                     URL='/'||SubStr(URL,3)
  393.                                 END
  394.                                 /*???remove pre-slash (again)???*/
  395.                                 URL=LocalURLDomain||URL    /* pre-slash finally replaced by domain name */
  396.                              END
  397.                             ELSE DO
  398.                                 DO While Left(URL,2)='..'    /* convert "../" to "//" */
  399.                                     URL='/'||SubStr(URL,3)
  400.                                 END
  401.                                 IF Left(URL,1)='/' THEN URL=SubStr(URL,2)    /* remove 1st spurious pre-slash (otherwise path wrongly interpreted) */
  402.                                 URL=LocalURLDir||URL    /* local reference -> expand to full */
  403.                             END
  404.                         */
  405.                             IF Left(URL,1)='/' THEN URL=SubStr(URL,2)    /* remove pre-slash */
  406.                             DO While Left(URL,2)='..'            /* convert "../" to "//" */
  407.                                 URL='/'||SubStr(URL,3)
  408.                             END
  409.                             IF Left(URL,1)='/' THEN URL=SubStr(URL,2)    /* remove 1st spurious pre-slash (otherwise path wrongly interpreted) */
  410.                             URL=LocalURLDir||URL                /* local reference -> expand to full */
  411.  
  412.                              /* if have double-slashes (go down dir), then remove relevant dirs */
  413.                             Done=0
  414.                             DO Until Done=1            /* an algorithm with a bit of magic! */
  415.                                 URLLen=Length(URL)
  416.                                 EndDPos=Index(URL,'//',8)                /* marks end of '//' */
  417.                                 IF EndDPos>0 THEN DO
  418.                                     StartDPos=Index(Reverse(URL),'/',URLLen-EndDPos+2)
  419.                                     IF StartDPos>0 THEN DO
  420.                                         StartDPos=URLLen-StartDPos+1        /* marks 1st slash before '//' */
  421.                                         URL=Left(URL,StartDPos)||SubStr(URL,EndDPos+2)
  422.                                      END
  423.                                     ELSE
  424.                                         Done=1
  425.                                  END
  426.                                 ELSE
  427.                                     Done=1
  428.                             END
  429.                          END
  430.                         ELSE DO
  431.                             IF URLRest=='' THEN Download=0        /* nothing after ":" */
  432.                         END
  433.  
  434.                         IF Download=1 THEN DO
  435.                              /* remove "#search" from "http:path/file#search" */
  436.                             IF Index(URL,'#')~=0 THEN DO
  437.                                 Parse VALUE Reverse(URL) WITH . '#' URL
  438.                                 URL=Reverse(URL)
  439.                             END
  440.  
  441.                              /* used to check for suffix & if it is not part of e.g. www.amiga.com */
  442.                             Parse VALUE Reverse(URL) WITH URLFile '/' .
  443.                             Parse VAR URLFile Suffix '.' .
  444.                             URLFile=Reverse(URLFile)
  445.                             Suffix=Reverse(Suffix)
  446.                             DirSuffix=0
  447.                             GotSuffix=0
  448.                             IF Suffix~=URLFile THEN DO
  449.                                 GotSuffix=1
  450.                                 IF Index(Reverse(URL),'/')>(Length(URL)-7) THEN DO    /* last '/' belongs to "http://": the dot is in the host name */
  451.                                     DirSuffix=1
  452.                                     GotSuffix=0
  453.                                 END
  454.                             END
  455.  
  456.                             IF (GotSuffix=0)&(Right(URL,1)~='/')&(Index(URL,'?')=0) THEN
  457.                                 URL2=URL||'/'    /* MAY need to add implicit slash */
  458.                             ELSE
  459.                                 URL2=''
  460.  
  461.                             ExternalLink=0    /* reset for every link (was misspelt, so one external link marked all later links external) */
  462.                             IF Left(URL,Length(BaseURLDir))~==BaseURLDir THEN DO
  463.                                 IF SwBroken=0 THEN
  464.                                     Download=0    /* don't download pages below initial dir */
  465.                                 ELSE
  466.                                     ExternalLink=1    /* do download but no further */
  467.                             END
  468.  
  469.                              /* check if should download this file-type */
  470.                             HTMLfile=0
  471.                             IF (GotSuffix=1)&(ExternalLink=0) THEN DO    /*never consider external links; '&' is AND ('&&' is exclusive-or in ARexx)*/
  472.                                 Suffix=Upper(Left(Suffix,3,' '))
  473.                                  /* as well as always downloading HTML files, also intelligently downloads if suffix same as last user-confirmed download */
  474.                                 IF (Suffix~='HTM')&(Suffix~='SHT')&(Suffix~='SH ')&(Suffix~=LastSuffix) THEN DO
  475.                                     Ask=1
  476.                                     Arc=0; Pic=0
  477.                                     IF (Suffix='LZX')|(Suffix='LHA')|(Suffix='ZIP')|(Suffix='LZH')|(Suffix='ZOO') THEN Arc=1
  478.                                     IF (Suffix='GIF')|(Suffix='JPG')|(Suffix='JPE')|(Suffix='PNG')|(Suffix='JFI') THEN Pic=1
  479.  
  480.                                     IF (Arc=1)&(SwArc=1) THEN Ask=0
  481.                                     IF (Pic=1)&(SwPic=1) THEN Ask=0
  482.  
  483.                                     IF Download=1 THEN DO
  484.                                         IF Ask=1 THEN DO
  485.                                             IF SwNoAsk=1 THEN
  486.                                                 Download=0
  487.                                             ELSE DO
  488.                                                 Say 'QUERY:  Download file "'||URL||'"?'
  489.                                                 DO Until Input~=''
  490.                                                     Pull Input
  491.                                                 END
  492.                                                 IF Left(Input,1)='N' THEN
  493.                                                     Download=0
  494.                                                 ELSE
  495.                                                     LastSuffix=Suffix
  496.                                             END
  497.                                         END
  498.                                     END
  499.                                   END
  500.                                 ELSE
  501.                                     HTMLfile=1
  502.                             END
  503.  
  504.                             IF FileURL=URL  THEN Download=0        /* avoid self-referencing infinite loops */
  505. /*IF Download=1 THEN DO
  506. IF URL2='' THEN
  507.     Say '--Final URL="'||URL||'", Download='||Download
  508. ELSE
  509.     Say '--Final URL="'||URL||'"(/), Download='||Download
  510. END*/
  511.  
  512.                             IF Download=1 THEN DO
  513.                                  /* store URL in list */
  514.                                 URL=Strip(URL,'T')
  515.                                 INTERPRET 'URLListSize='||URLList||'0 + 1'
  516.                                 INTERPRET URLList||'0 = URLListSize'
  517.                                 INTERPRET URLList||URLListSize||' = URL'
  518.  
  519.                                 INTERPRET URLList||URLListSize||'.HTML = HTMLfile'    /* record whether file should be scanned! */
  520.                                 INTERPRET URLList||URLListSize||'.EXT = ExternalLink'    /* record is external link */
  521.  
  522.                                  /* add 2nd possible interpretation of URL to list*/
  523.                                 IF URL2~='' THEN DO
  524.                                     URL2=Strip(URL2,'T')
  525.                                     INTERPRET 'URLListSize='||URLList||'0 + 1'
  526.                                     INTERPRET URLList||'0 = URLListSize'
  527.                                     INTERPRET URLList||URLListSize||' = URL2'
  528.  
  529.                                     INTERPRET URLList||URLListSize||'.HTML = HTMLfile'    /* record whether file should be scanned! */
  530.                                     IF ExternalLink=1 THEN INTERPRET URLList||URLListSize||'.EXT = 1' /* record is external link */
  531.                                 END
  532.                             END
  533.                         END
  534.                      END
  535. /*                    ELSE DO
  536.                         IF SwTerse=0 THEN DO
  537.                             IF SwNoPause=0 THEN DO
  538.                                 Say 'WARNING:  Empty URL at pos '||NewPos||' in line "'||Line||'" (press <return>)'
  539.                                 Pull Input
  540.                              END
  541.                             ELSE
  542.                                 Say 'WARNING:  Empty URL at pos '||NewPos||' in line "'||Line||'"'
  543.                         END
  544.  
  545.                     END
  546. */
  547.                 END
  548.             END
  549.         END
  550.         CALL Close(.file)
  551.     END
  552. Return
  553.  
  554. GetHTML: PROCEDURE
  555.     /* GetHTML(TheURL,File) - ask the current host to fetch TheURL into File, then wait for it to finish */
  556.      /* arg 1 = address to fetch, arg 2 = local file to write */
  557.     Link=Arg(1)
  558.     Target=Arg(2)
  559.  
  560.      /* hand the job to the host and start it */
  561.     'SET OUTFILE' Target
  562.     'SET URL' Link
  563.     'START'
  564.      /* poll: pause, ask QUERY FINISHED, and go round again while the answer is above zero */
  565.     DO UNTIL StillBusy<=0
  566.         CALL Delay(50)
  567.         'QUERY FINISHED'
  568.         StillBusy=Result
  569.     END
  570. Return
  571.